`"text": true` should have meant ".//text()", not "text()".

Akinori MUSHA 10 years ago
parent
commit
7b6119f1f2

+ 2 - 4
app/models/agents/website_agent.rb

@@ -28,10 +28,10 @@ module Agents
28 28
           "extract": {
29 29
             "url": { "css": "#comic img", "value": "@src" },
30 30
             "title": { "css": "#comic img", "value": "@title" },
31
-            "body_text": { "css": "div.main", "value": "text()" }
31
+            "body_text": { "css": "div.main", "value": ".//text()" }
32 32
           }
33 33
 
34
-      "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and "text()" is to extract the enclosed text.  You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove comma from a formatted number, etc.
34
+      "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and ".//text()" is to extract all the enclosed text.  You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from a formatted number, etc.  Note that these functions take a string, not a node set (a node set passed to them is converted using only the string value of its first node), so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`.
35 35
 
36 36
       When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
37 37
 
@@ -162,8 +162,6 @@ module Agents
162 162
                       # Node#xpath() returns any numeric value as float;
163 163
                       # convert it to integer as appropriate.
164 164
                       value = value.to_i if value.to_i == value
165
-                    when Nokogiri::XML::NodeSet
166
-                      value = value.first
167 165
                     end
168 166
                     value.to_s
169 167
                   }

+ 1 - 1
db/migrate/20140723110551_adopt_xpath_in_website_agent.rb

@@ -7,7 +7,7 @@ class AdoptXpathInWebsiteAgent < ActiveRecord::Migration
7 7
       agent.options['extract'].each { |name, extraction|
8 8
         case
9 9
         when extraction.delete('text')
10
-          extraction['value'] = 'text()'
10
+          extraction['value'] = './/text()'
11 11
         when attr = extraction.delete('attr')
12 12
           extraction['value'] = "@#{attr}"
13 13
         end

+ 2 - 2
spec/fixtures/agents.yml

@@ -10,8 +10,8 @@ jane_website_agent:
10 10
                  :expected_update_period_in_days => 2,
11 11
                  :mode => :on_change,
12 12
                  :extract => {
13
-                     :title => {:css => "item title", :value => 'text()'},
14
-                     :url => {:css => "item link", :value => 'text()'}
13
+                     :title => {:css => "item title", :value => './/text()'},
14
+                     :url => {:css => "item link", :value => './/text()'}
15 15
                  }
16 16
                }.to_json.inspect %>
17 17
 

+ 1 - 1
spec/models/agent_spec.rb

@@ -769,7 +769,7 @@ describe AgentDrop do
769 769
         mode: 'on_change',
770 770
         extract: {
771 771
           url: { css: '[id^=strip_enlarged_] img', value: '@src' },
772
-          title: { css: '.STR_DateStrip', value: 'text()' },
772
+          title: { css: '.STR_DateStrip', value: './/text()' },
773 773
         },
774 774
       },
775 775
       schedule: 'every_12h',

+ 19 - 1
spec/models/agents/website_agent_spec.rb

@@ -257,7 +257,6 @@ describe Agents::WebsiteAgent do
257 257
           'mode' => "on_change",
258 258
           'extract' => {
259 259
             'url' => {'css' => "#topLeft a", 'value' => "@href"},
260
-            'title' => {'css' => "#topLeft a", 'value' => "text()"}
261 260
           }
262 261
         }
263 262
         rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
@@ -287,6 +286,25 @@ describe Agents::WebsiteAgent do
287 286
         event.payload['num_links'].should == "9"
288 287
       end
289 288
 
289
+      it "should return all texts concatenated if XPath returns many text nodes" do
290
+        rel_site = {
291
+          'name' => "XKCD",
292
+          'expected_update_period_in_days' => 2,
293
+          'type' => "html",
294
+          'url' => "http://xkcd.com",
295
+          'mode' => "on_change",
296
+          'extract' => {
297
+            'slogan' => {'css' => "#slogan", 'value' => ".//text()"}
298
+          }
299
+        }
300
+        rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
301
+        rel.user = users(:bob)
302
+        rel.save!
303
+        rel.check
304
+        event = Event.last
305
+        event.payload['slogan'].should == "A webcomic of romance, sarcasm, math, and language."
306
+      end
307
+
290 308
       describe "JSON" do
291 309
         it "works with paths" do
292 310
           json = {